{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### 将数据集划分成训练集、验证集、测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def split_datasets():\n",
    "    iris = load_iris()\n",
    "    feature = iris.data                  #样本特征集合\n",
    "    target = iris.target                # 样本标签\n",
    "    print(\"特征值：\", type(feature))\n",
    "    print(\"目标值：\", type(target))\n",
    "    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)  # 传的参数必须是numpy.ndarray或者pandas.dataframes，但是必须是传入特征值和目标值，不能一起传入\n",
    "    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n",
    "    print(\"训练集：\", x_train.shape, y_train.shape)\n",
    "    print(\"验证集：\", x_val.shape, y_val.shape)\n",
    "    print(\"测试集：\", x_test.shape, y_test.shape)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    split_datasets()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过将原始数据分为3个数据集合，我们就大大减少了可用于模型学习的样本数量， 并且得到的结果依赖于集合对（训练，验证）的随机选择。\n",
    "使用交叉验证最简单的方法是在估计器和数据集上调用 cross_val_score 辅助函数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#下面的示例展示了如何通过分割数据，拟合模型和计算连续 5 次的分数（每次不同分割）来估计 linear kernel 支持向量机在 iris 数据集上的精度:\n",
    "from sklearn.model_selection import cross_val_score\n",
    "clf = svm.SVC(kernel='linear', C=1)\n",
    "scores = cross_val_score(clf, iris.data, iris.target, cv=5)\n",
    "print(scores) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#标准化：\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler().fit(x_train) # fit生成规则\n",
    "x_trainScaler = scaler.transform(x_train) # 将规则应用于训练集\n",
    "x_testScaler = scaler.transform(x_test)  # 将规则应用于测试集\n",
    "\n",
    "#归一化：\n",
    "from sklearn.preprocessing import Normalizer\n",
    "scaler = Normalizer().fit(x_train) # fit生成规则\n",
    "x_trainScaler = scaler.transform(x_train) # 将规则应用于训练集\n",
    "x_testScaler = scaler.transform(x_test)  # 将规则应用于测试集\n",
    "\n",
    "# 二值化：设定一个阈值，大于阈值的赋值为1，小于等于阈值的赋值为0\n",
    "from sklearn.preprocessing import Binarizer\n",
    "scaler = Binarizer(threshold=3).fit(x_train) # threshold为设定的阀值\n",
    "x_trainScaler = scaler.transform(x_train) # 将规则应用于训练集\n",
    "x_testScaler = scaler.transform(x_test)  # 将规则应用于测试集\n",
    "\n",
    "#哑编码处理：\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "scaler = OneHotEncoder().fit(x_train.reshape((-1,1)))\n",
    "x_trainScaler = scaler.transform(x_train) # 将规则应用于训练集\n",
    "x_testScaler = scaler.transform(x_test)  # 将规则应用于测试集\n",
    "\n",
    "#PCA降维：\n",
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=3).fit(x_train)  # n_components设置降维到n维度\n",
    "x_trainPca = pca.transform(x_train) # 将规则应用于训练集\n",
    "x_testPca = pca.transform(x_test)  # 将规则应用于测试集\n",
    "\n",
    "#LDA降维：\n",
    "from sklearn.lda import LDA\n",
    "lda = LDA(n_components=3).fit(x_train)  # n_components设置降维到n维度\n",
    "x_trainLda = lda.transform(x_train) # 将规则应用于训练集\n",
    "x_testLda = lda.transform(x_test)  # 将规则应用于测试集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征筛选"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Filter法：\n",
    "from sklearn.feature_selection import SelectKBest, f_classif\n",
    "selector = SelectKBest(f_classif, k=4) -- 用f_classif方法，设定数目为4\n",
    "a=selector.fit(x,y)\n",
    "print(np.array(a.scores_),'\\n',a.get_support())  --  输出得分及选择的结果\n",
    "\n",
    "Wrapper法（递归法）：\n",
    "from sklearn.linear_model import LinearRegression --导入基模型\n",
    "from sklearn.feature_selection import RFE  -- 导入RFE模块\n",
    "model1 = LinearRegression()   -- 建立一个线性模型\n",
    "rfe = RFE(model1,4)           -- 进行多轮训练，设置筛选特征数目为4个\n",
    "rfe = rfe.fit(x,y)            -- 模型的拟合训练\n",
    "print(rfe.support_)           -- 输出特征的选择结果\n",
    "print(rfe.ranking_)           -- 特征的选择排名"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理中fit(),transform()与fit_transform()的区别\n",
    "\n",
    "\n",
    "1. Fit(): 就是求得训练集X的均值啊，方差啊，最大值\n",
    "2. Transform():在Fit的基础上，进行标准化，降维，归一化等操作（看具体用的是哪个工具，如PCA，StandardScaler等）\n",
    "3. Fit_transform()：fit_transform是fit和transform的组合，既包括了训练又包含了转换。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "sc = StandardScaler()\n",
    "sc.fit_tranform(X_train)\n",
    "sc.tranform(X_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
